这里直接读取作者给定的第2个病人的 Gene expression analysis: validation patient,用的是 10x Genomics 5’ V(D)J 平台测序.
是 25066 genes across 11071 samples.
需要自行下载安装一些必要的R包! 而且需要注意版本 Seurat
因为大量学员在中国大陆,通常不建议大家使用下面的R包安装方法,建议是切换镜像后再下载R包。
参考:http://www.bio-info-trainee.com/3727.html
# 下面代码不运行。
# Enter commands in R (or R studio, if installed)
# Install the devtools package from Hadley Wickham
install.packages('devtools')
# Replace '2.3.0' with your desired version
devtools::install_version(package = 'Seurat', version = package_version('2.3.0'))
library(Seurat)
加载R包
rm(list = ls()) # clear the environment
#load all the necessary libraries
options(warn=-1) # turn off warning message globally
suppressMessages(library(Seurat))
start_time <- Sys.time()
# 如果觉得这里较慢,可以使用 data.table 包的 fread函数。
raw_data <- read.csv('../Output_2018-03-12/GSE118056_raw.expMatrix.csv.gz', header = TRUE, row.names = 1)
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.314043 mins
# 通常电脑一分钟可以搞定。
dim(raw_data) # 11,071 cells and 25,066 genes - already filtered
## [1] 25066 11071
data <- log2(1 + sweep(raw_data, 2, median(colSums(raw_data))/colSums(raw_data), '*')) # Normalization
cellTypes <- sapply(colnames(data), function(x) ExtractField(x, 2, '[.]'))
cellTypes <- ifelse(cellTypes == '1', 'PBMC', 'Tumor')
table(cellTypes)
## cellTypes
## PBMC Tumor
## 5748 5323
简单看看表达矩阵的性质,主要是基因数量,细胞数量;以及每个细胞表达基因的数量,和每个基因在多少个细胞里面表达。
# 可以看到, 2万多的基因里面,
# 绝大部分基因只在一万多细胞的至少200个是表达的
fivenum(apply(data,1,function(x) sum(x>0) ))
## AC114752.1 CNTD1 BEX5 FAM91A1 MT-CO1
## 0 11 258 1806 11068
boxplot(apply(data,1,function(x) sum(x>0) ))
# 可以看到,一万多细胞里面
# 绝大部分细胞可以检测到2000个以上的基因。
fivenum(apply(data,2,function(x) sum(x>0) ))
## AGCTTGATCCTTGCCA.1 TCATTACGTACCTACA.1 ACGGCCACAAGAAGAG.1
## 206.0 1478.5 2242.0
## GCTCCTACAAGGTTTC.2 GGCAATTAGTGGTAAT.2
## 3695.5 9278.0
hist(apply(data,2,function(x) sum(x>0) ))
start_time <- Sys.time()
# Create Seurat object
seurat <- CreateSeuratObject(raw.data = data, project = '10x_MCC_2') # already normalized
dim(data)
## [1] 25066 11071
seurat # 25,066 genes and 11,071 cells
## An object of class seurat in project 10x_MCC_2
## 25066 genes across 11071 samples.
# 可以看到上面创建Seurat对象的那些参数并没有过滤基因或者细胞。
# Add meta.data (nUMI and cellTypes)
seurat <- AddMetaData(object = seurat, metadata = apply(raw_data, 2, sum), col.name = 'nUMI_raw')
seurat <- AddMetaData(object = seurat, metadata = cellTypes, col.name = 'cellTypes')
这里绘图,可以指定分组,前提是这个分组变量存在于meta信息里面,我们创建对象后使用函数添加了 cellTypes 属性,所以可以用来进行可视化。
这里是:‘cellTypes’,就是PMBC和tumor的区别
sce=seurat
VlnPlot(object = sce,
features.plot = c("nGene", "nUMI"),
group.by = 'cellTypes', nCol = 2)
GenePlot(object = sce, gene1 = "nUMI", gene2 = "nGene")
可以看看高表达量基因是哪些
tail(sort(Matrix::rowSums(sce@raw.data)))
## MT-CYB RPL10 MT-CO3 MT-CO2 MT-CO1 EEF1A1
## 62963.00 64713.21 69561.78 71444.01 73749.92 73919.11
## 散点图可视化任意两个基因的一些属性(通常是细胞的度量)
# 这里选取两个基因。
tmp=names(sort(Matrix::rowSums(sce@raw.data),decreasing = T))
GenePlot(object = sce, gene1 = tmp[1], gene2 = tmp[2])
# 散点图可视化任意两个细胞的一些属性(通常是基因的度量)
# 这里选取两个细胞
CellPlot(sce,sce@cell.names[3],sce@cell.names[4],do.ident = FALSE)
start_time <- Sys.time()
# 最耗费时间的步骤在这里。
seurat <- ScaleData(object = seurat, vars.to.regress = c('nUMI_raw'), model.use = 'linear', use.umi = FALSE)
## NormalizeData has not been run, therefore ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
##
## Time Elapsed: 1.00048394997915 mins
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.588338 mins
seurat <- FindVariableGenes(object = seurat,
mean.function = ExpMean,
dispersion.function = LogVMR,
x.low.cutoff = 0.05,
x.high.cutoff = 4,
y.cutoff = 0.5)
head(seurat@var.genes)
## [1] "largeTAntigen" "FO538757.3" "FO538757.2" "HES4"
## [5] "ISG15" "TNFRSF18"
length(seurat@var.genes)
## [1] 2450
# 这里使用的pcs.compute 比前面两个要多。
seurat <- RunPCA(object = seurat,
pc.genes = seurat@var.genes,
pcs.compute = 40)
## [1] "PC1"
## [1] "NNAT" "DNAJB1" "NEFM" "HSPH1"
## [5] "SOX2" "FSCN1" "SPOCK2" "DDIT4"
## [9] "NRN1" "NEFH" "RAB3A" "FABP5"
## [13] "CKS2" "HSPA1B" "ODC1" "ZWINT"
## [17] "HSPA1A" "PLA2G12A" "PDLIM1" "CCK"
## [21] "CD3D" "IL7R" "largeTAntigen" "RORA"
## [25] "SLC25A39" "NUSAP1" "SMC4" "CD69"
## [29] "HSPB1" "SYNE1"
## [1] ""
## [1] "AIF1" "SPI1" "TYMP" "FCN1" "BCL2A1" "SERPINA1"
## [7] "CTSS" "ICAM1" "S100A11" "LST1" "FCER1G" "PLEK"
## [13] "NLRP3" "CD68" "CXCL8" "TNFAIP2" "CFD" "CD83"
## [19] "PLAUR" "NEAT1" "LGALS1" "NINJ1" "CFP" "S100A8"
## [25] "CCL3L3" "CLEC7A" "LILRB2" "NAMPT" "IFNGR2" "WARS"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "NNAT" "H1F0" "FSCN1" "NEFM" "FABP5" "TXN"
## [7] "NRN1" "SOX2" "HSPH1" "NEFH" "MAFB" "GPX1"
## [13] "PDLIM1" "HSPA1B" "KLF4" "PLA2G12A" "LMNA" "PGRMC1"
## [19] "ZWINT" "RAB3A" "TSC22D1" "TPTEP1" "CKS2" "HSPA1A"
## [25] "YWHAH" "RHOB" "CCK" "ARC" "SEPT5" "CTTN"
## [1] ""
## [1] "CD7" "ZAP70" "CD247" "TBC1D10C" "CTSW" "CD69"
## [7] "TRBC1" "CST7" "GZMA" "GZMM" "BIN2" "TSC22D3"
## [13] "KLRB1" "PRKCH" "CLEC2B" "TRAF3IP3" "SKAP1" "IL2RB"
## [19] "PRF1" "MATK" "HOPX" "GNLY" "GZMB" "ARL4C"
## [25] "P2RY8" "CD3D" "FGFBP2" "WIPF1" "ARHGAP15" "SPON2"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "TUBB1" "GP9" "C6orf25" "GNG11" "PF4"
## [6] "SDPR" "CMTM5" "RGS18" "ACRBP" "F13A1"
## [11] "PTGS1" "HRAT92" "FO538757.3" "TMEM40" "PPBP"
## [16] "CLU" "TREML1" "PTCRA" "DMTN" "ESAM"
## [21] "GRAP2" "LINC00989" "HIST1H2AC" "TRIM58" "CLEC1B"
## [26] "C2orf88" "PRKAR2B" "ITGA2B" "RUFY1" "CLDN5"
## [1] ""
## [1] "MAFB" "NNAT" "IFNGR2" "FSCN1" "NAMPT" "NEFM"
## [7] "NCF1" "KLF4" "TXN" "HSPH1" "FABP5" "NRN1"
## [13] "PLAUR" "SOX2" "S100A8" "ACSL1" "TNFAIP2" "HSPA1B"
## [19] "CXCL8" "NEFH" "CD83" "ATP2B1" "CCL3L3" "S100A12"
## [25] "SGK1" "HSPA1A" "BCL2A1" "DMXL2" "HLA-DQB1" "TMEM176A"
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "CD79A" "MS4A1" "IGHM" "LINC00926" "FAM129C"
## [6] "BANK1" "FOXP1" "IL7R" "CCR7" "BLK"
## [11] "IGHD" "PIM2" "FCER2" "CD19" "CXCR4"
## [16] "MAL" "AFF3" "TCF7" "TAGAP" "LEF1"
## [21] "MYC" "BIRC3" "CD22" "SELL" "TSC22D3"
## [26] "TCL1A" "CD3D" "RHOH" "CD69" "ADAM28"
## [1] ""
## [1] "FGFBP2" "GNLY" "GZMB" "CST7" "PRF1" "SPON2"
## [7] "KLRD1" "GZMH" "HOPX" "KLRF1" "S1PR5" "GZMA"
## [13] "CLIC3" "ADGRG1" "CTSW" "PRSS23" "TTC38" "MATK"
## [19] "IL2RB" "IGFBP7" "TRDC" "SH2D1B" "FCGR3A" "FCRL6"
## [25] "TBX21" "APMAP" "CMC1" "C12orf75" "EFHD2" "RHOC"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "IL7R" "CD3D" "TCF7" "MAL" "FLT3LG"
## [6] "LEF1" "INPP4B" "CD3G" "CD6" "TC2N"
## [11] "CD5" "CD2" "RCAN3" "AQP3" "GIMAP7"
## [16] "TRAC" "SPOCK2" "SERINC5" "CD28" "TRABD2A"
## [21] "LIME1" "PIK3IP1" "LINC00861" "TRAT1" "GATA3"
## [26] "OXNAD1" "CD27" "S1PR1" "LAT" "TTC39C"
## [1] ""
## [1] "CD79A" "MS4A1" "IGHM" "LINC00926" "FAM129C"
## [6] "BANK1" "IGHD" "CD19" "BLK" "FCER2"
## [11] "ADAM28" "CD22" "TCL1A" "AFF3" "IGKC"
## [16] "RALGPS2" "FCRL1" "ARHGAP24" "VPREB3" "CD79B"
## [21] "MEF2C" "HLA-DPB1" "HLA-DQA1" "FCRLA" "KIAA0226L"
## [26] "HVCN1" "BLNK" "POU2AF1" "TNFRSF13C" "SWAP70"
## [1] ""
## [1] ""
seurat <- RunTSNE(object = seurat, dims.use = 1:10, perplexity = 50, do.fast = TRUE)
start_time <- Sys.time()
## 避免太多log日志被打印出来。
seurat <- FindClusters(object = seurat,
reduction.type = 'pca',
dims.use = 1:10,
resolution = 0.6,
print.output = 0,
k.param = 20,
save.SNN = TRUE)
TSNEPlot(seurat, group.by = 'cellTypes')
TSNEPlot(seurat,group.by = "ident",pt.shape ='cellTypes')
end_time <- Sys.time()
end_time - start_time
## Time difference of 10.80697 secs
start_time <- Sys.time()
save(seurat,file = 'patient2.seurat.output.Rdata')
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.98285 mins
# 这个步骤会输出文件
实际上最后的图也需要标记细胞类群,文章如下:
作者使用的marker基因列表也可以在文章附件找到,如下:
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] Seurat_2.3.4 Matrix_1.2-14 cowplot_0.9.3 ggplot2_3.2.0
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_1.4-0 class_7.3-14
## [4] modeltools_0.2-22 ggridges_0.5.1 mclust_5.4.1
## [7] rprojroot_1.3-2 htmlTable_1.13.1 base64enc_0.1-3
## [10] rstudioapi_0.9.0 proxy_0.4-22 npsurv_0.4-0
## [13] flexmix_2.3-14 bit64_0.9-7 codetools_0.2-15
## [16] splines_3.5.1 R.methodsS3_1.7.1 lsei_1.2-0
## [19] robustbase_0.93-3 knitr_1.21 jsonlite_1.6
## [22] Formula_1.2-3 ica_1.0-2 cluster_2.0.7-1
## [25] kernlab_0.9-27 png_0.1-7 R.oo_1.22.0
## [28] compiler_3.5.1 httr_1.3.1 backports_1.1.3
## [31] assertthat_0.2.0 lazyeval_0.2.1 lars_1.2
## [34] acepack_1.4.1 htmltools_0.3.6 tools_3.5.1
## [37] bindrcpp_0.2.2 igraph_1.2.2 gtable_0.2.0
## [40] glue_1.3.0 RANN_2.6 reshape2_1.4.3
## [43] dplyr_0.7.8 Rcpp_1.0.0 gdata_2.18.0
## [46] ape_5.2 nlme_3.1-137 iterators_1.0.10
## [49] fpc_2.2-3 gbRd_0.4-11 lmtest_0.9-36
## [52] xfun_0.4 stringr_1.3.1 irlba_2.3.2
## [55] gtools_3.8.1 DEoptimR_1.0-8 MASS_7.3-50
## [58] zoo_1.8-4 scales_1.0.0 doSNOW_1.0.16
## [61] parallel_3.5.1 RColorBrewer_1.1-2 yaml_2.2.0
## [64] reticulate_1.10 pbapply_1.3-4 gridExtra_2.3
## [67] rpart_4.1-13 segmented_0.5-3.0 latticeExtra_0.6-28
## [70] stringi_1.2.4 foreach_1.4.4 checkmate_1.9.1
## [73] caTools_1.17.1.1 bibtex_0.4.2 Rdpack_0.10-1
## [76] SDMTools_1.1-221 rlang_0.3.1 pkgconfig_2.0.2
## [79] dtw_1.20-1 prabclus_2.2-6 bitops_1.0-6
## [82] evaluate_0.12 lattice_0.20-35 ROCR_1.0-7
## [85] purrr_0.3.0 bindr_0.1.1 labeling_0.3
## [88] htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5
## [91] plyr_1.8.4 magrittr_1.5 R6_2.3.0
## [94] snow_0.4-3 gplots_3.0.1.1 Hmisc_4.2-0
## [97] pillar_1.3.1 foreign_0.8-70 withr_2.1.2
## [100] fitdistrplus_1.0-11 mixtools_1.1.0 survival_2.42-3
## [103] nnet_7.3-12 tsne_0.1-3 tibble_2.0.1
## [106] crayon_1.3.4 hdf5r_1.0.1 KernSmooth_2.23-15
## [109] rmarkdown_1.10 grid_3.5.1 data.table_1.12.0
## [112] metap_1.0 digest_0.6.18 diptest_0.75-7
## [115] tidyr_0.8.1 R.utils_2.7.0 stats4_3.5.1
## [118] munsell_0.5.0